*-------------------------------------------------------------------------------*
* DIRECTORIES
*-------------------------------------------------------------------------------*
* Positional arguments of this do-file, stored as l_direc_data, l_direc_code, l_direc_figtab
args l_direc_data l_direc_code l_direc_figtab

* Raw inputs and cleaned outputs are subfolders of the data directory
local direc_data_raw   "`l_direc_data'/raw"
local direc_data_clean "`l_direc_data'/clean"



*-------------------------------------------------------------------------------*
* READ AUTHOR-ITY
*-------------------------------------------------------------------------------*
* Import each of the 105 raw PMID-info CSV chunks (the last one carries the suffix final),
* keep the fields used downstream, and store every cleaned chunk in its own tempfile.
qui forval i=1/105 {
	* File suffix is the chunk number, except for chunk 105
	local j `i'
	if `i'==105 local j final

	import delimited "`direc_data_raw'/1_authority_pmidinfo_`j'.csv", ///
		varnames(1) bindquote(strict) encoding(UTF-8) clear

	keep pmid year info journal

	* Force both identifiers to numeric; values that do not parse become missing and are dropped
	foreach l_num in pmid year {
		cap destring `l_num' , replace force
	}
	drop if pmid==. | year==.

	* pmid_art flags rows whose info string contains Journal Article; pmid_oth is its complement
	gen pmid_art = regexm(info, "(Journal Article)")
	gen pmid_oth = !pmid_art
	drop info
	rename year pmid_year

	tempfile T`i'
	save `T`i'', replace
	nois di "`i' in"
}

* Stack the 105 chunk tempfiles, dropping exact duplicates after chunks 20, 40, 60 and 80
* (presumably to limit the size of the data held in memory)
use `T1', clear
forval i=2/105 {
	append using `T`i''
	nois di "`i' appended"
	if inlist(`i', 20, 40, 60, 80) {
		gduplicates drop
	}
}

* Normalize journal names: blank out periods, commas, colons and the lowercase substring the,
* then trim, collapse repeated blanks and upper-case.
* NOTE(review): the substitution of the is case-sensitive and runs before upper(), so it also
* hits the letters inside longer words and leaves a capitalized The untouched. The journal
* list is cleaned with the same rule, so any change has to be made in both places.
foreach l_punc in "." "," ":" "the" {
	replace journal = subinstr(journal,"`l_punc'"," ",.)
}
replace journal = upper(itrim(trim(journal)))
gduplicates drop

* Re-create journal through gen/drop/rename: values are unchanged, the column moves last
gen j = journal
drop journal
rename j journal

gduplicates drop
compress

* Attach the two citation counts to every PMID; PMIDs without a citation row are kept
preserve
	import delimited "`direc_data_raw'/authority_pmidcites.csv", clear
	drop v1
	rename (pm t a) (pmid pmid_cittot pmid_citpry)
	* Keep a single citation row per PMID
	gduplicates drop pmid , force
	tempfile TC
	save `TC', replace
restore
joinby pmid using `TC', unmatched(master)
tab _merge
drop _merge

compress
save "`direc_data_clean'/pmid_info.dta", replace



*-------------------------------------------------------------------------------*
* READ JOURNAL INFO
*-------------------------------------------------------------------------------*
* Build the lookup from cleaned journal name to NLM id out of the journal list
import delimited "`direc_data_raw'/journal_list.csv", varnames(1) clear

rename nlmid journal_nlmid
* Prepend the zeros that the leading column says are missing from the id
replace journal_nlmid = "0" + journal_nlmid if regexm(leading,"should have 0")
replace journal_nlmid = "00" + journal_nlmid if regexm(leading,"should have two 0")

keep journal journal_nlmid
* Same name normalization as used for the PMID-level journal strings, so the names can be joined
foreach l_punc in "." "," ":" "the" {
	replace journal = subinstr(journal,"`l_punc'"," ",.)
}
replace journal = upper(itrim(trim(journal)))

gduplicates drop

compress
tempfile T_j
save `T_j', replace //journal journal_nlmid | unique @ journal


* PMIDS BY JOURNAL
* Restrict to publication years 1950-2008 and tag each PMID with the NLM id of its journal;
* PMIDs whose journal is not in the journal list keep an empty id.
use "`direc_data_clean'/pmid_info.dta" if inrange(pmid_year, 1950, 2008), clear

joinby journal using `T_j', unmatched(master) // only need mesh for pmid of journals, or out
replace journal_nlmid = "" if _merge!=3
drop _merge journal
gduplicates drop

* Listed journals: a missing citation count becomes zero. Unlisted journals: counts are set to missing.
foreach l_var in pmid_cittot pmid_citpry {
	replace `l_var' = cond(journal_nlmid == "", ., cond(`l_var'==., 0, `l_var'))
}

compress
tempfile T_jp
save `T_jp', replace // pmid pmid_year pmid_art pmid_oth journal_nlmid | unique @ pmid



*-------------------------------------------------------------------------------*
* PMID MESH INFO
*-------------------------------------------------------------------------------*
* For each of the 104 cleaned PMID-MeSH chunks: attach the PMID attributes, give every row a
* weight of one over the number of rows of its PMID, and sum the weights by term-journal-year.
qui forval i=1/104 {
	use "`direc_data_raw'/1_authority_pmidinfo_`i'_clean.dta", clear
	rename mesh mesh_term
	drop if mesh_term == ""

	joinby pmid using `T_jp' // only need mesh for pmid of journals (but includes all journals...)

	* NOTE(review): this runs after the join on pmid, so pmid presumably is numeric already -- confirm
	cap destring pmid , replace force
	drop if pmid == .

	* Fractional count: the rows of one PMID carry a total weight of one
	bysort pmid: gen frcnt = 1/_N

	* All publications: plain weight, and weight scaled by each citation count
	gen frcnt_all = frcnt
	gen frcntcit1_all = frcnt * pmid_cittot
	gen frcntcit2_all = frcnt * pmid_citpry

	* Rows flagged pmid_art
	gen frcnt_art = frcnt * pmid_art
	gen frcntcit1_art = frcnt * pmid_art * pmid_cittot
	gen frcntcit2_art = frcnt * pmid_art * pmid_citpry

	* Rows flagged pmid_oth
	gen frcnt_oth = frcnt * pmid_oth

	rename pmid_year year

	collapse (sum) frcnt* , by(mesh_term journal_nlmid year)

	compress
	tempfile T`i'
	save `T`i'', replace
	nois di "`i' $S_TIME"
}
* Stack the 104 chunk-level aggregates and sum again to one row per term-journal-year
use `T1', clear
forval k=2/104 {
	append using `T`k''
}
compress
collapse (sum) frcnt* , by(mesh_term journal_nlmid year)

* Turn every measure into its share of the journal-year total; undefined shares become zero
foreach l_var of varlist frcnt* {
	egen denom = total(`l_var') , by(journal_nlmid year)

	gen jshr_`l_var' = `l_var'/denom
	replace jshr_`l_var' = 0 if jshr_`l_var' == .

	drop denom
}

* Diagnostic only: shares summed within journal-year, data restored afterwards
preserve
	collapse (sum) jshr_* , by(journal_nlmid year)
	foreach l_chk of varlist jshr_* {
		su `l_chk'
		su `l_chk' if `l_chk' > 0 //TEST FOR SHARE = 1
	}
restore

replace mesh_term = lower(mesh_term)

* Rows without a journal id are stored under the id NA
replace journal_nlmid = "NA" if journal_nlmid == ""

sort journal_nlmid year mesh_term
compress
save "`direc_data_clean'/p_shares_mjt.dta", replace //unbalanced



*-------------------------------------------------------------------------------*
* EDITOR INFO
*-------------------------------------------------------------------------------*
* Editor records: keep the author-journal link and the two tenure bounds, renamed to year names
import delimited "`direc_data_raw'/author_editor_crosswalk.csv", clear

keep author_id journal_nlmid editor_startdate editor_enddate
rename (editor_startdate editor_enddate) (editor_startyear editor_endyear)

compress
save "`direc_data_clean'/editor_info.dta", replace

* Distinct editor ids, used further down to filter the author-PMID files
keep author_id
gduplicates drop

tempfile T_ea
save `T_ea', replace //author_id



*-------------------------------------------------------------------------------*
* PREP AUTHOR-PMID
*-------------------------------------------------------------------------------*
* Build the author_id-pmid list, restricted to editors, from the files in authority_long.
* Files are read in batches so that the editor filter (joinby on T_ea) is applied before the
* batches are stacked.
* Batches cover files 1-999, 1000-1999, 2000-2999 and so on. The boundaries are derived from
* the running file count rather than a fixed list of counts, so any number of files works,
* and the trailing partial batch gets the same editor filter as the completed ones.
local l_filelist : dir "`direc_data_raw'/authority_long" files "*.dta"
local l_batch = 1000
local l_saved
local l_open = 0
local cnt = 1
qui foreach l_file in `l_filelist' {

	* A new batch starts at the first file and at every multiple of the batch size
	if `cnt'==1 | mod(`cnt',`l_batch')==0 {
		use "`direc_data_raw'/authority_long/`l_file'", clear
	}
	else {
		append using "`direc_data_raw'/authority_long/`l_file'"
	}
	local l_open = 1

	* A batch is closed on the file just before the next multiple of the batch size
	if mod(`cnt',`l_batch')==`l_batch'-1 {

		joinby author_id using `T_ea' // only need pmid of editors

		tempfile T`cnt'
		save `T`cnt'', replace
		local l_saved `l_saved' `cnt'
		clear
		local l_open = 0
	}

	nois di "`cnt'"
	local cnt = `cnt'+1
}
* Filter the trailing partial batch as well; with no open batch, start from empty data
if `l_open' {
	joinby author_id using `T_ea' // only need pmid of editors
}
else {
	clear
}
foreach l_cnt of local l_saved {
	append using `T`l_cnt''
}
cap destring pmid , replace force
drop if pmid == .

gduplicates drop
compress
tempfile T_ap
save `T_ap', replace //author_id pmid | unique @ author_id pmid



*-------------------------------------------------------------------------------*
* PREP PMID-MESH FOR EDITORS
*-------------------------------------------------------------------------------*
* Keep, from each of the 104 cleaned PMID-MeSH chunks, only the rows whose PMID appears in
* the author-PMID list, then stack the filtered chunks.
qui forval i=1/104 {
	use "`direc_data_raw'/1_authority_pmidinfo_`i'_clean.dta", clear
	rename mesh mesh_term

	joinby pmid using `T_ap' // only need mesh for pmid of editors

	tempfile T`i'
	save `T`i'', replace
	nois di "`i' $S_TIME"
}

use `T1', clear
forval k=2/104 {
	append using `T`k''
}

* Drop rows without a numeric PMID, then exact duplicates
cap destring pmid , replace force
drop if pmid == .
gduplicates drop

compress
tempfile T_ep_m
save `T_ep_m', replace



*-------------------------------------------------------------------------------*
* EDITOR SHARES: PRE-TENURE
*-------------------------------------------------------------------------------*
* MeSH profile of every editor-journal pair, built from the publications of the editor that
* are dated before the start year of the editorship.
use "`direc_data_clean'/editor_info.dta", clear
keep author_id journal_nlmid editor_startyear

* Editor to PMIDs, then PMIDs to their attributes
joinby author_id using `T_ap'
joinby pmid using "`direc_data_clean'/pmid_info.dta"
gduplicates drop

* Keep publications strictly before the editorship starts
keep if pmid_year < editor_startyear
drop pmid_year editor_startyear

* PMIDs to MeSH terms
joinby pmid using `T_ep_m'
gduplicates drop

drop if mesh_term == ""

* Weight of one over the number of rows per editor-journal-PMID, split by publication flag
bysort author_id journal_nlmid pmid: gen frcnt_all = 1/_N
gen frcnt_art = frcnt_all * pmid_art
gen frcnt_oth = frcnt_all * pmid_oth

collapse (sum) frcnt_* , by(author_id journal_nlmid mesh_term)

* eshr1 is the share of the editor-journal total (undefined shares set to zero);
* eshr2 is the raw weight, turned into a share in a later step
foreach l_var of varlist frcnt_* {
	egen denom = total(`l_var') , by(author_id journal_nlmid)
	gen eshr1_`l_var' = `l_var'/denom
	replace eshr1_`l_var' = 0 if eshr1_`l_var' == .
	drop denom

	gen eshr2_`l_var' = `l_var' // will make shares later
}

keep author_id journal_nlmid mesh_term eshr*

* Diagnostic only: totals per editor-journal, data restored afterwards
preserve
	collapse (sum) eshr* , by(author_id journal_nlmid)
	foreach l_chk of varlist eshr* {
		su `l_chk'
		su `l_chk' if `l_chk' > 0 //TEST FOR SHARE = 1 for eshr1*
	}
restore

compress
save "`direc_data_clean'/e_shares_ejm.dta", replace // not balanced



*-------------------------------------------------------------------------------*
* PREP ED-JOURNAL-YEAR
*-------------------------------------------------------------------------------*
* Build the editor x journal x year panel for 1950-2008, keeping only the tenure years.
* The year grid is produced with expand, one row per editor and year, so it always covers
* every year from 1950 through 2008. Generating years from the observation number plus
* 1950 and then calling fillin would start the grid at 1951 and could yield no more
* distinct years than there are editors.
use "`direc_data_clean'/editor_info.dta", clear

keep author_id
gduplicates drop

local l_year0 = 1950
local l_year1 = 2008
expand `l_year1' - `l_year0' + 1
bysort author_id: gen year = `l_year0' + _n - 1

joinby author_id using "`direc_data_clean'/editor_info.dta"

* One row per editor-journal-year with the earliest start and the latest end on record
gcollapse (min) editor_startyear (max) editor_endyear , by(author_id journal_nlmid year)

keep if year >= editor_startyear & year <= editor_endyear


* MAKE EDITOR SHARES
* Attach the pre-tenure MeSH profile of every editor to each of their tenure years
joinby author_id journal_nlmid using "`direc_data_clean'/e_shares_ejm.dta"

* Diagnostic only: totals per editor-journal-year
preserve
	collapse (sum) eshr* , by(author_id journal_nlmid year)
	foreach l_chk of varlist eshr* {
		su `l_chk'
		su `l_chk' if `l_chk' > 0 //TEST FOR SHARE = 1 for eshr1*
	}
restore

* Rescale every measure by its journal-year total over all editors and terms
foreach l_var of varlist eshr* {
	egen denom = total(`l_var'), by(journal_nlmid year)
	replace `l_var' = `l_var' / denom
	drop denom
}

* Sum over editors to one row per journal-year-term
collapse (sum) eshr* , by(journal_nlmid year mesh_term)

* Diagnostic only: totals per journal-year
preserve
	collapse (sum) eshr* , by(journal_nlmid year)
	foreach l_chk of varlist eshr* {
		su `l_chk'
		su `l_chk' if `l_chk' > 0 //TEST FOR SHARE = 1 for eshr1* and for eshr2*
	}
restore

replace mesh_term = lower(mesh_term)

sort journal_nlmid year mesh_term
compress
save "`direc_data_clean'/e_shares_mjt.dta", replace // not balanced



*-------------------------------------------------------------------------------*
* JOURNAL-YEAR-MESH PANEL (FILLED IN)
*-------------------------------------------------------------------------------*
* Build the full journal x MeSH term x year grid from the publication shares, then drop the
* years that fall before the journal-specific cutoffs listed below.
use "`direc_data_clean'/p_shares_mjt.dta", clear

keep journal_nlmid mesh_term year

drop if mesh_term == "" | year == .

* Every combination of journal, term and year that occurs in the data
fillin journal_nlmid mesh_term year
drop _f

drop if year > 2008
drop if journal_nlmid == "9421642" & year < 1978
drop if journal_nlmid == "9419065" & year < 1993
drop if journal_nlmid == "9304532" & year < 1996
drop if journal_nlmid == "8303128" & year < 1981
drop if journal_nlmid == "8301365" & year < 1983
drop if journal_nlmid == "7501160" & year < 1980
drop if journal_nlmid == "2985191R" & year < 1978
drop if journal_nlmid == "1310650" & year < 1976
drop if journal_nlmid == "1300217" & year < 1975
drop if journal_nlmid == "0410462" & year < 1983
drop if journal_nlmid == "0372541" & year < 1981
drop if journal_nlmid == "0372354" & year < 1972
drop if journal_nlmid == "0372351" & year < 1969
drop if journal_nlmid == "0255562" & year < 1976
drop if journal_nlmid == "0217410" & year < 1975
drop if journal_nlmid == "0147763" & year < 1981
drop if journal_nlmid == "0047103" & year < 1980

drop if journal_nlmid == "0401260" & year < 1964
* Rows without a journal id: p_shares_mjt.dta stores them under the id NA (the empty id is
* recoded before that file is saved), so a test for the empty string alone never matches.
drop if inlist(journal_nlmid, "", "NA") & year < 1964

//based on pub data
drop if journal_nlmid == "9419065" & year < 1998
drop if journal_nlmid == "0410462" & year < 1988 

* Attach a display name to every journal id: the shortest name listed for that id
preserve
	import delimited "`direc_data_raw'/journal_list.csv", varnames(1) clear
	rename nlmid journal_nlmid
	* Prepend the zeros that the leading column says are missing from the id
	replace journal_nlmid = "0" + journal_nlmid if regexm(leading,"should have 0")
	replace journal_nlmid = "00" + journal_nlmid if regexm(leading,"should have two 0")

	keep journal_nlmid journal
	rename journal journal_name

	gen len = strlen(journal_name)
	bysort journal_nlmid (len): keep if _n == 1
	drop len

	tempfile Tj
	save `Tj', replace
restore
joinby journal_nlmid using `Tj', unmatched(master) //keep out-of-sample
drop _merge

* Attach the mesh tree file to every term; terms that are not in the file are dropped by the join
preserve
	import delimited "`direc_data_raw'/cleaned_mesh_trees.csv", varnames(1) clear
	duplicates drop
	compress
	tempfile T_tree
	save `T_tree', replace
restore
joinby mesh_term using `T_tree'

* For every MeSH tree number, compute per level 2..14 a depth ratio for the branch it
* belongs to (meshL_branchperc) plus their row maximum, and join the result on mesh_tree.
preserve
	import delimited "`direc_data_raw'/cleaned_mesh_trees.csv", varnames(1) clear
	
	keep mesh_tree
	gduplicates drop

	* meshL_tree holds the first 1, 3, 7, ... 51 characters of the tree number for levels
	* L = 1..14, and is left empty when the tree number is shorter than that
	local l_lev = 1
	foreach l_cut in 1 3 7 11 15 19 23 27 31 35 39 43 47 51 {
		gen mesh`l_lev'_tree = substr(mesh_tree,1,`l_cut') if strlen(mesh_tree) >=`l_cut'
		local l_lev = 1+`l_lev'
	}

	* Depth of the tree number: the highest level with a non-empty prefix
	gen mesh_branchlev = .
	forval l_lev =1/14 {
		replace mesh_branchlev = `l_lev' if mesh`l_lev'_tree != ""
	}

	* Per level: tmp1 is the largest depth among tree numbers sharing the level prefix,
	* tmp2 is depth divided by tmp1 for tree numbers sitting exactly at that level, and
	* the max spreads that ratio to every tree number with the same prefix.
	* Tree numbers shallower than the level share the empty prefix and end up missing.
	forval l_lev=2/14 {
		egen tmp1 = max(mesh_branchlev), by(mesh`l_lev'_tree)
		gen tmp2 = mesh_branchlev / tmp1 if mesh_branchlev == `l_lev'
		egen mesh`l_lev'_branchperc = max(tmp2), by(mesh`l_lev'_tree)
		drop tmp1 tmp2
	}

	keep mesh_tree mesh*_branchperc
	gduplicates drop mesh_tree , force

	* Largest ratio over levels 2..14
	egen mesh_branchperc = rowmax(mesh*_branchperc)

	keep mesh_tree mesh*_branchperc
	compress
	tempfile Tm
	save `Tm', replace

restore
joinby mesh_tree using `Tm'

* First year in which each MeSH term appears in the publication shares
preserve
	use "`direc_data_clean'/p_shares_mjt.dta", clear
	gcollapse (min) mesh_firstyear=year , by(mesh_term)
	compress
	tempfile T_first
	save `T_first', replace
restore
joinby mesh_term using `T_first'

* Drop the years before the first appearance of the term and count years since then
drop if year < mesh_firstyear
gen mesh_yearspostfirst = year - mesh_firstyear

* Rows without a journal name or id get the residual labels
replace journal_name = "Rest of PubMed" if journal_name==""
replace journal_nlmid = "NA" if journal_nlmid == ""

compress
save "`direc_data_clean'/mjt_base.dta" , replace



*-------------------------------------------------------------------------------*
* REG DATA BUILD AGG LOOP
*-------------------------------------------------------------------------------*
* Start from the journal x MeSH x year base and attach the editor shares, then the
* publication shares; base rows without a match are kept
use "`direc_data_clean'/mjt_base.dta", clear

foreach l_src in e_shares_mjt p_shares_mjt {
	joinby journal_nlmid mesh_term year using "`direc_data_clean'/`l_src'.dta", unmatched(master)
	drop _merge
}

*REMOVE NATURE, HEALTH AFFAIRS, J NEURO NEURO PSYCH SINCE BAD EDITOR DATA
drop if inlist(journal_nlmid, "8303128", "2985191R", "0410462")

tempfile Tstart
save `Tstart', replace

* Write one regression-ready file per aggregation cutoff. For cutoffs below 1 the MeSH tree
* numbers are shortened to a branch prefix and the shares are summed within that prefix;
* for the cutoff 1.00 the data are used at the original tree-number level.
foreach l_meshcollapseperc in 0.10 0.25 0.50 0.75 1.00 { //

	* File-name tag: the cutoff with the decimal point removed (010, 025, 050, 075, 100)
	local l_meshaggflag = subinstr("`l_meshcollapseperc'",".","",.)
	
	use `Tstart', clear

	if `l_meshcollapseperc' < 1 {
		
		* Lookup: one term name per tree number
		preserve
			keep mesh_tree mesh_term 
			gduplicates drop mesh_tree, force
			tempfile Tm1
			save `Tm1', replace
		restore
		
		* Lookup: journal id for every journal name and year
		preserve
			keep journal_name journal_nlmid year
			gduplicates drop
			tempfile Tjy
			save `Tjy', replace
		restore
		
		* Aggregated tree number: the level prefix (3, 7, ... 51 characters for levels 2..14).
		* The loop runs from level 14 down to 2, so the shallowest level whose ratio
		* exceeds the cutoff is the one that sticks.
		* NOTE(review): a missing ratio also compares as greater than the cutoff in Stata,
		* so rows with a missing meshL_branchperc are replaced as well -- confirm intended.
		gen mesh = ""
		forval i=14(-1)2 {
			local l_cut = -1 + ((`i'-1)*4)
			replace mesh = substr(mesh_tree,1,`l_cut') if mesh`i'_branchperc > `l_meshcollapseperc' 
		}
		
		* Summary statistics of years since first appearance, per aggregated tree number
		preserve
			keep mesh mesh_yearspostfirst
			rename mesh mesh_tree
			gcollapse 	(mean) mea_mesh_yearspostfirst=mesh_yearspostfirst ///
						(min) min_mesh_yearspostfirst=mesh_yearspostfirst ///
						(sd) sd_mesh_yearspostfirst=mesh_yearspostfirst ///
						(max) max_mesh_yearspostfirst=mesh_yearspostfirst ///
						, by(mesh_tree)
						
			compress			
			tempfile Tm2
			save `Tm2', replace
		restore
		
		* Sum all editor and publication share variables within prefix, journal and year
		gcollapse 	(sum) es* js* , by(mesh journal_name year)		
		rename mesh mesh_tree
		
		joinby mesh_tree using `Tm1' , unmatched(master) 
		tab _m
		drop _m // some may not have names, that's ok
		
		joinby mesh_tree using `Tm2' , unmatched(master) 
		tab _m
		drop _m // some may not have yearinfo, that's ok
		
		* Brings journal_nlmid back, since the collapse above keeps only journal_name
		joinby journal_name year using `Tjy' , unmatched(using) 
		tab _m
		drop _m // unmatched zeros
		
		* Rows without a prefix, journal name or journal id are removed
		drop if mesh_tree=="" | journal_name=="" | journal_nlmid==""
	}
	
	* Missing shares become zero
	foreach l_var of varlist es* js* {
		recode `l_var' .=0
	}
	
	* Group identifiers for journal, year, tree number and their pairwise combinations
	egen fe_j = group(journal_name)
	gen fe_t = year
	egen fe_m = group(mesh_tree)

	egen fe_jm = group(fe_j fe_m)
	egen fe_jt = group(fe_j fe_t)
	egen fe_mt = group(fe_m fe_t)
		
	compress

	save "`direc_data_clean'/mjt_regready_agg`l_meshaggflag'.dta", replace

}



